Christian Panse / Jonas Grossmann
2016/01/24
library(ISCB2017)
library(knitr)
library(lattice)unique(XIC$protein)## [1] P00331_ADH2 P04807_HXKB P33302_PDR5 P53929_YNK8
## Levels: P00331_ADH2 P04807_HXKB P33302_PDR5 P53929_YNK8
unique(XIC$peptide
)## [1] AGHWAAISGAAGGLGSLAVQYAK
## [2] AIIFYESNGK
## [3] ANGTVVLVGLPAGAK
## [4] ATNGGAHGIINVSVSEAAIEASTR
## [5] CSSDVFNHVVK
## [6] DIVSAVVK
## [7] EELFTSLGGEVFIDFTK
## [8] VLGIDGGPGK
## [9] VVGLSSLPEIYEK
## [10] YSGVCHTDLHAWHGDWPLPTK
## [11] AFIDEQFPQGISEPIPLGFTFSFPASQNK
## [12] DIYGWTQTSLDDYPIK
## [13] ELMQQIENFEK
## [14] ESGDFLAIDLGGTNLR
## [15] FDKPFVMDTSYPAR
## [16] GFDIPNIENHDVVPMLQK
## [17] IEEDPFENLEDTDDLFQNEFGINTTVQER
## [18] IFTVPTETLQAVTK
## [19] IVPAEDGSGAGAAVIAALAQK
## [20] LALMDMYK
## [21] LSDDIPPSAPMAINCEYGSFDNEHVVLPR
## [22] LSELIGAR
## [23] LSVCGIAAICQK
## [24] MGVIFGTGVNGAYYDVCSDIEK
## [25] MSSGYYLGEILR
## [26] NIPIEVVALINDTTGTLVASYYTDPETK
## [27] TGHIAADGSVYNR
## [28] TTQNPDELWEFIADSLK
## [29] YDITIDEESPRPGQQTFEK
## [30] AGTSLQGLQNQMLAVFMFTVIFNPILQQYLPSFVQQR
## [31] ARPSSPYTVSYMMQVK
## [32] AVQSELDWMER
## [33] CADYELLEFTPPSGMTCGQYMEPYLQLAK
## [34] CPADANPAEWMLEVVGAAPGSHANQDYYEVWR
## [35] EMNDYWVK
## [36] ESYANHLAEVAMATYGLSHTR
## [37] ETNTFQILKPMDGCLNPGELLVVLGRPGSGCTTLLK
## [38] FILTIFNQLFIGFTFFK
## [39] FPCAEYVPR
## [40] GDTSTFYFR
## [41] GEILVFPR
## [42] GEVVYNAEADVHLPHLTVFETLVTVAR
## [43] GFGIGMAYVVFFFFVYLFLCEYNEGAK
## [44] GIHIPQTPK
## [45] GPAYANISSTESVCTVVGAVPGQDYVLGDDFIR
## [46] GSAMFFAILFNAFSSLLEIFSLYEARPITEK
## [47] GSITAAEDK
## [48] GTYQYYHK
## [49] HEFSQSIIYQTK
## [50] ISYSGYSGDDIK
## [51] LANHGQAILCTIHQPSAILMQEFDR
## [52] LDPNSENFSSAAWVK
## [53] LIIAVCFNIIFYFLVDFR
## [54] LLNDDEASR
## [55] LLVFLDEPTSGLDSQTAWSICQLMK
## [56] LNNNVNDVTSYSSASSSTENAADLHNYNGFDEHTEAR
## [57] LTIGVELTAKPK
## [58] MLQESSEEESDTYGEIGLSK
## [59] NANDPENVGER
## [60] NGGVFFFYLLINIVAVFSMSHLFR
## [61] NLCYEVQIK
## [62] NLSASGASADVAYQSTVVNIPYK
## [63] NMAHLSAADPDFYKPYSLGCAWK
## [64] NNIGFTLFMILGNCSMALILGSMFFK
## [65] NYGIFICYIAFNYIAGVFFYWLAR
## [66] QPAEVSIEEK
## [67] QTTADFLTSVTSPSER
## [68] SDAQSIFSSGVEGVNPIFSDPEAPGYDPK
## [69] SDLSSDR
## [70] SISSNTHGFDLGADTK
## [71] TGYLTDENATDTCSFCQISTTNDYLANVNSFYSER
## [72] TLSEAMVPASMLLLALSMYTGFAIPK
## [73] TLTAQSMQNSTQSAPNK
## [74] TMIDYFESHGAHK
## [75] TQADISNTSATVAIYQCSQDAYDLFNK
## [76] TYSLYHPSADAFASVLSEIPSK
## [77] VCVLDDGYQIYYGPADK
## [78] VGNDIVR
## [79] VSPLTYFIQALLAVGVANVDVK
## [80] VTMGVITGDILVNGIPR
## [81] WIWYINPLAYLFESLLINEFHGIK
## [82] YADAVVGVAGEGLNVEQR
## [83] YFEDMGYVCPSR
## [84] CLETSKPTVEALK
## [85] FFPSMISPDWEPSIIPSNK
## [86] FWPVFIDR
## [87] GENESIPFEER
## [88] GETEEDIFER
## [89] GPYPPPPTGIDNDVPLSEHGVEQAHELANYISK
## [90] GVGEWYKPDRPIIPEPATHEVMSK
## [91] IPLYVDR
## [92] LDVKPEMIFSSPFYR
## [93] LTMNGNTSFLTNGEEMNWTFMNAFEAGSDADIK
## [94] MAIENIYIAR
## [95] NGSCAIDK
## [96] SALGMNLLK
## [97] SNWLPK
## [98] TIMIVTHAATK
## 98 Levels: AFIDEQFPQGISEPIPLGFTFSFPASQNK ... YSGVCHTDLHAWHGDWPLPTK
https://biblio.ugent.be/publication/8502633
dim(XIC)## [1] 2175989 7
table(XIC$filename)##
## 20161115_02_G3.raw 20161115_03_GE4.raw
## 162934 179176
## 20161115_04_GE2_a.raw 20161115_05_G4_rep.raw
## 161972 171804
## 20161115_05_G4.raw 20161115_07_GE3.raw
## 175169 169229
## 20161115_08_G1_a.raw 20161115_09_G2_a_rep.raw
## 167397 165653
## 20161115_09_G2_a.raw 20161115_10_GE1_a_rep.raw
## 129779 178308
## 20161115_12_GE1_b.raw 20161115_13_G1_b.raw
## 176766 168620
## 20161115_14_G2_b.raw
## 169182
table(XIC$protein)##
## P00331_ADH2 P04807_HXKB P33302_PDR5 P53929_YNK8
## 276315 429786 1032880 437008
table(XIC$peptide)##
## AFIDEQFPQGISEPIPLGFTFSFPASQNK
## 5125
## AGHWAAISGAAGGLGSLAVQYAK
## 22692
## AGTSLQGLQNQMLAVFMFTVIFNPILQQYLPSFVQQR
## 353
## AIIFYESNGK
## 53741
## ANGTVVLVGLPAGAK
## 29472
## ARPSSPYTVSYMMQVK
## 16391
## ATNGGAHGIINVSVSEAAIEASTR
## 7015
## AVQSELDWMER
## 41919
## CADYELLEFTPPSGMTCGQYMEPYLQLAK
## 939
## CLETSKPTVEALK
## 32162
## CPADANPAEWMLEVVGAAPGSHANQDYYEVWR
## 1065
## CSSDVFNHVVK
## 43289
## DIVSAVVK
## 26211
## DIYGWTQTSLDDYPIK
## 25095
## EELFTSLGGEVFIDFTK
## 17603
## ELMQQIENFEK
## 59988
## EMNDYWVK
## 46793
## ESGDFLAIDLGGTNLR
## 24259
## ESYANHLAEVAMATYGLSHTR
## 14417
## ETNTFQILKPMDGCLNPGELLVVLGRPGSGCTTLLK
## 319
## FDKPFVMDTSYPAR
## 22482
## FFPSMISPDWEPSIIPSNK
## 21194
## FILTIFNQLFIGFTFFK
## 18242
## FPCAEYVPR
## 37651
## FWPVFIDR
## 40367
## GDTSTFYFR
## 54216
## GEILVFPR
## 43300
## GENESIPFEER
## 44835
## GETEEDIFER
## 31588
## GEVVYNAEADVHLPHLTVFETLVTVAR
## 1485
## GFDIPNIENHDVVPMLQK
## 28947
## GFGIGMAYVVFFFFVYLFLCEYNEGAK
## 6204
## GIHIPQTPK
## 47025
## GPAYANISSTESVCTVVGAVPGQDYVLGDDFIR
## 419
## GPYPPPPTGIDNDVPLSEHGVEQAHELANYISK
## 590
## GSAMFFAILFNAFSSLLEIFSLYEARPITEK
## 3493
## GSITAAEDK
## 52905
## GTYQYYHK
## 40576
## GVGEWYKPDRPIIPEPATHEVMSK
## 2881
## HEFSQSIIYQTK
## 52460
## IEEDPFENLEDTDDLFQNEFGINTTVQER
## 735
## IFTVPTETLQAVTK
## 21020
## IPLYVDR
## 31294
## ISYSGYSGDDIK
## 23666
## IVPAEDGSGAGAAVIAALAQK
## 11091
## LALMDMYK
## 59050
## LANHGQAILCTIHQPSAILMQEFDR
## 2272
## LDPNSENFSSAAWVK
## 20111
## LDVKPEMIFSSPFYR
## 41262
## LIIAVCFNIIFYFLVDFR
## 8684
## LLNDDEASR
## 48149
## LLVFLDEPTSGLDSQTAWSICQLMK
## 10751
## LNNNVNDVTSYSSASSSTENAADLHNYNGFDEHTEAR
## 850
## LSDDIPPSAPMAINCEYGSFDNEHVVLPR
## 1518
## LSELIGAR
## 38247
## LSVCGIAAICQK
## 35360
## LTIGVELTAKPK
## 18503
## LTMNGNTSFLTNGEEMNWTFMNAFEAGSDADIK
## 1963
## MAIENIYIAR
## 36082
## MGVIFGTGVNGAYYDVCSDIEK
## 5040
## MLQESSEEESDTYGEIGLSK
## 7737
## MSSGYYLGEILR
## 27427
## NANDPENVGER
## 18018
## NGGVFFFYLLINIVAVFSMSHLFR
## 2855
## NGSCAIDK
## 19248
## NIPIEVVALINDTTGTLVASYYTDPETK
## 2074
## NLCYEVQIK
## 27020
## NLSASGASADVAYQSTVVNIPYK
## 5798
## NMAHLSAADPDFYKPYSLGCAWK
## 13007
## NNIGFTLFMILGNCSMALILGSMFFK
## 9170
## NYGIFICYIAFNYIAGVFFYWLAR
## 7379
## QPAEVSIEEK
## 67556
## QTTADFLTSVTSPSER
## 21208
## SALGMNLLK
## 61605
## SDAQSIFSSGVEGVNPIFSDPEAPGYDPK
## 1451
## SDLSSDR
## 42454
## SISSNTHGFDLGADTK
## 22661
## SNWLPK
## 32024
## TGHIAADGSVYNR
## 41974
## TGYLTDENATDTCSFCQISTTNDYLANVNSFYSER
## 354
## TIMIVTHAATK
## 39913
## TLSEAMVPASMLLLALSMYTGFAIPK
## 2250
## TLTAQSMQNSTQSAPNK
## 14488
## TMIDYFESHGAHK
## 24844
## TQADISNTSATVAIYQCSQDAYDLFNK
## 1509
## TTQNPDELWEFIADSLK
## 9930
## TYSLYHPSADAFASVLSEIPSK
## 5381
## VCVLDDGYQIYYGPADK
## 23549
## VGNDIVR
## 30715
## VLGIDGGPGK
## 36935
## VSPLTYFIQALLAVGVANVDVK
## 6610
## VTMGVITGDILVNGIPR
## 19452
## VVGLSSLPEIYEK
## 26241
## WIWYINPLAYLFESLLINEFHGIK
## 1482
## YADAVVGVAGEGLNVEQR
## 28924
## YDITIDEESPRPGQQTFEK
## 10424
## YFEDMGYVCPSR
## 13850
## YSGVCHTDLHAWHGDWPLPTK
## 13116
xic.filter <- XIC$filename == '20161115_13_G1_b.raw' & XIC$peptide == 'IFTVPTETLQAVTK'
lattice::xyplot(count ~ rt | as.character(charge) * filename,
data = XIC,
subset = xic.filter, #(XIC$peptide == 'IFTVPTETLQAVTK'),
type = 'h')lattice::xyplot(count ~ rt | as.character(charge) * filename,
data = XIC,
subset = xic.filter, #(XIC$peptide == 'IFTVPTETLQAVTK'),
xlim=c(62,65),
type = 'h')peptide.rt$pim <- sapply(as.character(peptide.rt$peptide), parentIonMass)
lattice::xyplot(pim ~ rt,
group = filename,
data=peptide.rt,
auto.key =TRUE)reqirement: query RT for every peptide and every LC-MS run
lattice::xyplot(peptide ~ rt | filename,
group=grepl('GE', filename),
data = peptide.rt)pmax <- aggregate(score ~ filename + peptide,
FUN=max,
data=peptide.rt)
peptide.rt.max <- merge(peptide.rt, pmax, by=c('peptide', 'filename', 'score'))View(peptide.rt.max)dim(XIC)## [1] 2175989 7
dim(peptide.rt.max)## [1] 192 5
XIC.rt <- merge(XIC, peptide.rt.max, by=c('peptide', 'filename'))
dim(XIC.rt)## [1] 462578 10
eps <- 0.5
filter <- XIC.rt$rt.y - eps < XIC.rt$rt.x & XIC.rt$rt.x < XIC.rt$rt.y + eps
XIC.rt.filtered <- XIC.rt[filter, ]
dim(XIC.rt.filtered)## [1] 8400 10
#library(lattice())
#xyplot(log(count,2) ~ rt|protein ,group=charge, data=XIC.f[grepl("20161115_12_GE1_b.raw", XIC.f$filename),], type='h')XIC.max <- aggregate(count ~ protein + peptide + charge + filename,
data=XIC.rt.filtered, FUN=max)
bwplot(count ~ filename | protein * grepl('GE', filename),
data=XIC.max,
scales = list(x = list(rot = 45)))bwplot(log(count) ~ filename | protein * grepl('GE', filename),
data=XIC.max,
scales = list(x = list(rot = 45)))bwplot( (log(count)-mean(log(count)))/sd(log(count)) ~ filename | protein * grepl('GE', filename),
data=XIC.max,
scales = list(x = list(rot = 45)))head(XIC.max)## protein peptide charge filename count
## 1 P04807_HXKB LALMDMYK 1 20161115_02_G3.raw 1264230.50
## 2 P04807_HXKB MSSGYYLGEILR 1 20161115_02_G3.raw 46869.84
## 3 P33302_PDR5 QPAEVSIEEK 1 20161115_02_G3.raw 160938.28
## 4 P53929_YNK8 SALGMNLLK 1 20161115_02_G3.raw 498726.30
## 5 P04807_HXKB DIYGWTQTSLDDYPIK 2 20161115_02_G3.raw 1324686.40
## 6 P04807_HXKB ELMQQIENFEK 2 20161115_02_G3.raw 3385561.50
M <- reshape2::dcast(XIC.max, formula = protein + peptide + charge ~ filename)## Using count as value column: use value.var to override.
head(M)## protein peptide charge 20161115_02_G3.raw
## 1 P00331_ADH2 AGHWAAISGAAGGLGSLAVQYAK 2 NA
## 2 P00331_ADH2 AGHWAAISGAAGGLGSLAVQYAK 3 NA
## 3 P00331_ADH2 AIIFYESNGK 1 NA
## 4 P00331_ADH2 AIIFYESNGK 2 NA
## 5 P00331_ADH2 AIIFYESNGK 3 NA
## 6 P00331_ADH2 ANGTVVLVGLPAGAK 1 NA
## 20161115_03_GE4.raw 20161115_04_GE2_a.raw 20161115_05_G4_rep.raw
## 1 749689.9 899255.1 NA
## 2 17331370.0 26432814.0 NA
## 3 2956808.8 4331269.5 NA
## 4 110262440.0 135853260.0 NA
## 5 313416.1 423874.6 NA
## 6 2570259.8 6858430.5 NA
## 20161115_05_G4.raw 20161115_07_GE3.raw 20161115_08_G1_a.raw
## 1 378555.3 462373.1 NA
## 2 11856046.0 6750584.0 NA
## 3 4937499.5 4534075.5 NA
## 4 158346720.0 160014270.0 NA
## 5 477027.6 462586.0 NA
## 6 3152056.2 3877851.8 NA
## 20161115_09_G2_a_rep.raw 20161115_10_GE1_a_rep.raw 20161115_12_GE1_b.raw
## 1 NA 443472.6 NA
## 2 NA 10076218.0 NA
## 3 NA 2833182.5 8303482.5
## 4 NA 122620136.0 243251620.0
## 5 NA 303788.5 637019.6
## 6 NA 3036921.0 12637748.0
## 20161115_13_G1_b.raw 20161115_14_G2_b.raw
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
rownames(M) <-paste(M[,1], M[,2], M[,3])
M[,1:3] <- NULL
M[is.na(M)] <- 0View(M)S.sum <- aggregate(count ~ protein + grepl('GE', filename) + filename, data=XIC.max, FUN=sum)
S.max <- aggregate(count ~ protein + grepl('GE', filename) + filename, data=XIC.max, FUN=max)
S.mean <- aggregate(count ~ protein + grepl('GE', filename) + filename, data=XIC.max, FUN=mean)
bwplot(log(S.sum$count) ~ S.sum$protein| grepl('GE', S.sum$filename),layout=c(2,1))bwplot(log(S.mean$count) ~ S.mean$protein| grepl('GE', S.mean$filename),layout=c(2,1))bwplot(log(S.max$count) ~ S.max$protein| grepl('GE', S.max$filename),layout=c(2,1))image(t(as.matrix(asinh(M))))library(gplots)##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
gplots::heatmap.2(asinh(as.matrix((M))), margins = c(20,15), trace = "none")lets switch to mq data …
gg <- c(rep('G', 6), rep('GE', 6))
dim(prxmq)## [1] 2515 12
table(gg)## gg
## G GE
## 6 6
op <- par(mfrow=c(1,3))
# example 0
idx <- 2
x <- log(unlist(prxmq[idx, ]),2)
boxplot(x ~ gg, main=row.names(prxmq)[idx])
t.test(x ~ gg, alternative = "two.sided")##
## Welch Two Sample t-test
##
## data: x by gg
## t = 0.16235, df = 8.7637, p-value = 0.8747
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.2723130 0.3142348
## sample estimates:
## mean in group G mean in group GE
## 35.91961 35.89865
# example 1
idx <- which(row.names(prxmq) == "sp|P00331|ADH2_YEAST")
x <- log(unlist(prxmq[idx, ]),2)
boxplot(x ~ gg, main=row.names(prxmq)[idx])
t.test(x ~ gg, alternative = "two.sided")##
## Welch Two Sample t-test
##
## data: x by gg
## t = -17.864, df = 7.8783, p-value = 1.175e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.659519 -5.133147
## sample estimates:
## mean in group G mean in group GE
## 27.37343 33.26976
#example 2
idx <- which(row.names(prxmq) == "sp|P33302|PDR5_YEAST")
x <- log(unlist(prxmq[idx, ]),2)
boxplot(x ~ gg, main=row.names(prxmq)[idx])t.test(x ~ gg, alternative = "two.sided")##
## Welch Two Sample t-test
##
## data: x by gg
## t = 6.4453, df = 9.9542, p-value = 7.548e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 3.289644 6.769160
## sample estimates:
## mean in group G mean in group GE
## 27.64608 22.61667
expression_analysis <- function(S, groups){
S.t.test <- lapply(1:nrow(S), function(idx){
x <- unlist(S[idx, ])
x[x==0] <- NA
if (min(aggregate(x, by=list(groups), function(x){sum(!is.na(x))})$x) > 5){
t <- t.test(log(x, 2) ~ groups, alternative = "two.sided")
r <- cbind(FC = diff(t$estimate), p.value = t$p.value, idx=idx)
row.names(r) <- row.names(S)[idx]
r
}else{NULL}
})
S.t.test <- as.data.frame(do.call('rbind', S.t.test))
S.t.test
}
SwissProtID <- sapply(strsplit (as.character(unique(XIC$protein)), split='_'), function(x){x[1]})
sp_filter <- grepl(paste(SwissProtID, collapse ='|'), rownames(prxmq))
S.t.test <- expression_analysis(prxmq[sp_filter,], gg)
S.t.test## FC p.value idx
## sp|P00331|ADH2_YEAST 5.89633336 1.175307e-07 1
## sp|P04807|HXKB_YEAST 0.09064906 8.791641e-01 2
## sp|P33302|PDR5_YEAST -5.02940208 7.548265e-05 3
## sp|P53929|YNK8_YEAST -0.97289832 7.216296e-02 4
plot_volcano <- function(S.t.test, label=FALSE, ...){
plot(-log(p.value,10) ~ FC, data=S.t.test,
main="volcano plot",
sub="p1946 prx", ...)
abline(v=c(-0.5,0.5), col='grey')
abline(h=-log(0.025, 10), col='grey')
points(-log(p.value,10) ~ FC,
data=S.t.test[abs(S.t.test$FC)>0.5 & S.t.test$p.value<0.025,],
col='red',cex=0.5)
points(-log(p.value,10) ~ FC,
data=S.t.test[grepl('ZZ', rownames(S.t.test)),],
col='cyan')
points(-log(p.value,10) ~ FC,
data=S.t.test[grepl('REV', rownames(S.t.test)),],
cex=2, lwd=2,
col='green')
if(label){
text(S.t.test$FC, -log(S.t.test$p.value,10), rownames(S.t.test), cex=0.5, pos=3)
}
}
plot_volcano(S.t.test, xlim=c(-10,10))S.t.test <- expression_analysis(prxmq, gg)
plot_volcano(S.t.test, xlim=c(-10,10), label = FALSE)random_swap <- function(x){
a <- sample(length(x),2)
tmp <- x[a[1]]
x[a[1]] <- x[a[2]]
x[a[2]] <- tmp
return(x)
}op<-par(mfrow=c(2,2))
plot_volcano(S.t.test)
(gg <- random_swap(gg)); S.t.test <- expression_analysis(prxmq, gg); plot_volcano(S.t.test)## [1] "G" "GE" "G" "G" "G" "G" "G" "GE" "GE" "GE" "GE" "GE"
(gg <- random_swap(gg)); S.t.test <- expression_analysis(prxmq, gg); plot_volcano(S.t.test)## [1] "G" "GE" "G" "G" "G" "G" "G" "GE" "GE" "GE" "GE" "GE"
(gg <- random_swap(gg)); S.t.test <- expression_analysis(prxmq, gg); plot_volcano(S.t.test)## [1] "G" "GE" "G" "GE" "G" "G" "G" "G" "GE" "GE" "GE" "GE"